library(DESeq2)
library(ggplot2)
library(AnnotationDbi)
library(org.Hs.eg.db)
library(ComplexHeatmap)
library(clusterProfiler)
library(EnhancedVolcano)
library(fgsea)
library(magrittr)
library(tidyverse)
library(vsn)
library(pheatmap)
library(RColorBrewer)
library(edgeR)
library(matrixStats)
library(circlize)
setwd("/Users/sueannechear/Bioinformatics/rnaseq/iMac/quant/second_analysis/Endothelialcells github")
counts<-read.delim("cleancount_V3.csv",header=T, sep=",")
rownames(counts)  <- counts[,1]
counts<- counts[ -c(1) ]
head(counts,3)

Samples used in this analysis:

df<-data.frame("Abbreviation"=c("iPSC","HUVEC","iBMEC","HBMEC","BEPC","CEC"),
               "Cell"=c("Human induced pluripotent stem cells","Human umbilical vein endothelial cells","iPSC derived brain microvascular endothelial cells","Human brain microvascular endothelial cells","Bronchial epithelial cells","Colon epithelial cells"),
               "Source"=c("PRJNA754196","GSE57662","GSE129290","GSE97575","GSE85402","GSE82207"), "Replicates"=c(3,3,3,3,2,2))
print(df)
##   Abbreviation                                               Cell      Source
## 1         iPSC               Human induced pluripotent stem cells PRJNA754196
## 2        HUVEC             Human umbilical vein endothelial cells    GSE57662
## 3        iBMEC iPSC derived brain microvascular endothelial cells   GSE129290
## 4        HBMEC        Human brain microvascular endothelial cells    GSE97575
## 5         BEPC                         Bronchial epithelial cells    GSE85402
## 6          CEC                             Colon epithelial cells    GSE82207
##   Replicates
## 1          3
## 2          3
## 3          3
## 4          3
## 5          2
## 6          2

Create a DESeq dataset : 18 samples; 62753 genes

condition<-factor(c("HUVEC","HUVEC","HUVEC","iEC","iEC",
                    "BEPC","BEPC",
                    "iPSC","iPSC","iPSC","HBMEC","HBMEC","HBMEC",
                    "CEC","CEC","iBMEC","iBMEC","iBMEC"
))

sample<-factor(colnames(counts))

coldata<-data.frame(sample,condition)

dds<-DESeqDataSetFromMatrix(countData = counts,
                            colData = coldata,
                            design=~condition)

dds
## class: DESeqDataSet 
## dim: 62753 18 
## metadata(1): version
## assays(1): counts
## rownames(62753): ENSG00000000003 ENSG00000000005 ... ENSG00000292372
##   ENSG00000292373
## rowData names(0):
## colnames(18): HUVEC_1 HUVEC_2 ... iBMEC.1 iBMEC.2
## colData names(2): sample condition

We retain genes with at least 1 count per million (CPM) in at least two samples. Genes remained after filtering: 20362

dds = dds[ rowSums(edgeR::cpm(counts(dds)) > 1)>=2, ]

nrow(dds)
## [1] 20362

QC for dispersion of variability in data. Fitted line trend below 1, indicating the data is a good fit for the DESeq model.

dds <- estimateSizeFactors(dds)
dds <- estimateDispersions(dds)
plotDispEsts(dds)

Transform data using VST method before PCA.

vstdata<-vst(dds,blind=F)

meanSdPlot(assay(vstdata), ranks=FALSE)

PCA: to examine variation between samples. Gnerally,HUVEC,iEC and HBMEC are clustered close together,indicating similar transcriptomics. PC1 clearly separates primary endothelial cells from other cell types. iPSC and epithelial cells are more similar in transcriptomics than they are with endothelial cells.

plotPCA(vstdata,intgroup="condition")

#check pc3 and pc4
plotPCA <- function (object, intgroup = "condition", ntop = 500, returnData = FALSE) 
{
    rv <- rowVars(assay(object))
    select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, 
        length(rv)))]
    pca <- prcomp(t(assay(object)[select, ]))
    percentVar <- pca$sdev^2/sum(pca$sdev^2)
    if (!all(intgroup %in% names(colData(object)))) {
        stop("the argument 'intgroup' should specify columns of colData(dds)")
    }
    intgroup.df <- as.data.frame(colData(object)[, intgroup, 
        drop = FALSE])
    group <- if (length(intgroup) > 1) {
        factor(apply(intgroup.df, 1, paste, collapse = " : "))
    }
    else {
        colData(object)[[intgroup]]
    }
    d <- data.frame(PC1 = pca$x[, 1], PC2 = pca$x[, 2], PC3 = pca$x[, 3], PC4 = pca$x[, 4], group = group, 
        intgroup.df, name = colnames(object))
    if (returnData) {
        attr(d, "percentVar") <- percentVar[1:2]
        return(d)
    }
    ggplot(data = d, aes_string(x = "PC3", y = "PC4", color = "group")) + 
        geom_point(size = 3) + xlab(paste0("PC3: ", round(percentVar[1] * 
        100), "% variance")) + ylab(paste0("PC4: ", round(percentVar[2] * 
        100), "% variance")) + coord_fixed()
}

#print(plotPCA(vstdata,intgroup="condition"))
ntop <- 500
rv <- rowVars(assay(vstdata))
select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
mat <- t( assay(vstdata)[select, ] )

pca<-prcomp(mat)
pca <- as.data.frame(pca$x)

Which genes impact the most in PC1?

getLoadings = function(dds){
  
  mat<-assay(vstdata)
  pca = prcomp(t(mat), retx = TRUE)
  
  return(pca$rotation)
}

loadings_vstdata = getLoadings(vstdata) %>% as.data.frame()
# Annotating gene names
loadings_vstdata$symbol = mapIds(org.Hs.eg.db,
                              keys=rownames(loadings_vstdata),
                              column="SYMBOL",
                              keytype="ENSEMBL",
                              multiVals="first")
## 'select()' returned 1:many mapping between keys and columns
# show the top 10 genes from PC1
loadings_vstdata %>% 
  # select only the PCs we are interested in
  dplyr::select(symbol, PC1) %>%
  # convert to "long" format
  pivot_longer(cols = "PC1", names_to = "PC1", values_to = "loadings") %>% 
  # for PC1
  group_by(PC1) %>% 
  # arrange by descending order
  arrange(desc(abs(loadings))) %>% 
  # take the 10 top rows
  slice(1:20) %>%
  pull(symbol)
##  [1] "KRT8"    "KRT19"   "KRT18"   "SLC1A2"  "CD24"    "FAM107A" "SPARCL1"
##  [8] "GFAP"    "A2M"     "ADGRF5"  "CD34"    "ATP1B2"  "CDH1"    "SLC6A1" 
## [15] "VWF"     "CRABP2"  "PMP2"    "PECAM1"  "TOP2A"   "CLDN6"

Which genes impact the most in PC2?

# show the top 20 genes from PC2
loadings_vstdata %>% 
  # select only the PCs we are interested in
  dplyr::select(symbol, PC2) %>%
  # convert to "long" format
  pivot_longer(cols = "PC2", names_to = "PC2", values_to = "loadings") %>% 
  # for PC2
  group_by(PC2) %>% 
  # arrange by descending order
  arrange(desc(abs(loadings))) %>% 
  # take the 10 top rows
  slice(1:20) %>%
  pull(symbol)
##  [1] "MMP1"    "CD93"    "ANKRD1"  "MMRN1"   "ESM1"    "PECAM1"  "PTPRZ1" 
##  [8] "KIF1A"   "THBS1"   "ATP1A2"  "PTX3"    "CLEC14A" "EFEMP1"  "CDH5"   
## [15] "PLP1"    "PLVAP"   "SOX18"   "CLDN6"   "CDH1"    "MMRN2"

Cluster dendrogram

rv <- rowVars(assay(vstdata))
o <- order(rv,decreasing=TRUE)
dists <- dist(t(assay(vstdata)[head(o,500),]))
hc <- hclust(dists)
plot(hc, labels=vstdata$sample)

Correlation matrix heat map of transcript expression across all samples. Euclidean distance is used as the similarity measure and clustering samples based on the ‘complete’ method. The lower the numbers, the stronger the correlation between samples.

sampleDists<-dist(t(assay(vstdata)))
sampleDistMatrix<-as.matrix(sampleDists)
colnames(sampleDistMatrix)

colors<-colorRampPalette(rev(brewer.pal(9,"Blues")))(255)

pheatmap(sampleDistMatrix, clustering_distance_rows = sampleDists, display_numbers = TRUE,
         clustering_distance_cols=sampleDists, col=colors, fontsize_number=7)

Heatmap to visualize clustering using top 100 genes.

#get the indices of the top variable genes
topVarGenes <- head(order(rowVars(assay(vstdata)), decreasing = TRUE), 100)

#subset the data matrix to include only the top variable genes
mat  <- assay(vstdata)[ topVarGenes, ]


#center the data
mat  <- mat - rowMeans(mat)
anno <- as.data.frame(colData(vstdata))

#map ensembl IDs to gene symbols
symbols <- mapIds(org.Hs.eg.db, keys = rownames(mat), column = "SYMBOL", keytype = "ENSEMBL")

symbols <- symbols[!is.na(symbols)]
  symbols <- symbols[match(rownames(mat), names(symbols))]
  rownames(mat) <- symbols
  keep <- !is.na(rownames(mat))
  mat <- mat[keep,]


#create a heatmap with hierarchical clustering
#heatmap_result <- pheatmap(mat, annotation_col = anno, fontsize_row=5)
# Perform hierarchical clustering separately
hc_rows <- hclust(dist(mat), method = "complete")

# Create a heatmap with hierarchical clustering
#heatmap_result <- pheatmap(mat, annotation_col = anno, fontsize_row = 5, clustering_distance_rows = "correlation")

heatmap_result <- pheatmap(mat, fontsize_row = 5, clustering_distance_rows = "correlation")

# Extract the cluster assignments for the rows
cluster_assignments <- cutree(hc_rows, k = 2)

# Print or use the cluster assignments as needed
print(cluster_assignments)

# Create a data frame with gene symbols and cluster assignments
gene_cluster_df <- data.frame(
  GeneSymbol = names(cluster_assignments),
  ClusterAssignment = cluster_assignments
)

# Order the data frame by cluster assignments
gene_cluster_df <- gene_cluster_df[order(gene_cluster_df$ClusterAssignment, gene_cluster_df$GeneSymbol), ]

# Print or use the data frame as needed
print(gene_cluster_df)

write.csv(gene_cluster_df, "genecluster.csv",row.names=F)

Pathways enriched in cluster 1 :these are genes upregulated in endothelial cells.

# Select genes in Cluster 1
genes_in_cluster1 <- gene_cluster_df$GeneSymbol[gene_cluster_df$ClusterAssignment == 1]

print(genes_in_cluster1)

entrez_ids <- mapIds(org.Hs.eg.db, keys = genes_in_cluster1, keytype = "SYMBOL", column = "ENTREZID")

GO_results <- enrichGO(gene = genes_in_cluster1, OrgDb = "org.Hs.eg.db", keyType = "SYMBOL", ont = "BP")

as.data.frame(GO_results)


plot(barplot(GO_results, showCategory = 15,cex.names = 0.5))

Pathways enriched in cluster 2; these are genes upregulated in non-endothelial cells.

# Select genes in Cluster 2
genes_in_cluster2 <- gene_cluster_df$GeneSymbol[gene_cluster_df$ClusterAssignment == 2]

print(genes_in_cluster2)

entrez_ids <- mapIds(org.Hs.eg.db, keys = genes_in_cluster2, keytype = "SYMBOL", column = "ENTREZID")

GO_results2 <- enrichGO(gene = genes_in_cluster2, OrgDb = "org.Hs.eg.db", keyType = "SYMBOL", ont = "BP")

as.data.frame(GO_results2)


plot(barplot(GO_results2, showCategory = 15,cex.names = 0.5))

Differential expression of gene analysis:

iEC vs iBMEC. (padjusted value=0.05)

pds<-dds

pds$condition<-relevel(pds$condition, ref="iBMEC")

pds<-DESeq(pds)

res = results(pds, contrast=c("condition","iEC","iBMEC"), alpha=0.05)

summary(res)
## 
## out of 20362 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 3358, 16%
## LFC < 0 (down)     : 4922, 24%
## outliers [1]       : 1444, 7.1%
## low counts [2]     : 0, 0%
## (mean count < 3)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
sigs<-na.omit(res)

sigs.df<-as.data.frame(sigs)

columns(org.Hs.eg.db)

sigs.df$symbol<-mapIds(org.Hs.eg.db, keys=rownames(sigs.df), keytype = "ENSEMBL",column="SYMBOL")

sigs.df<-sigs.df%>%filter(!str_detect(symbol,'NA'))

Retrieve gene ontology terms associated with upregulated genes in iEC samples

genes_to_test <- rownames(sigs[sigs$log2FoldChange >5,])

GO_results <- enrichGO(gene = genes_to_test, OrgDb = "org.Hs.eg.db", keyType = "ENSEMBL", ont = "BP")

as.data.frame(GO_results)


plot(barplot(GO_results, showCategory = 15))

Retrieve gene ontology terms associated with downregulated genes in iEC samples

genes_to_test <- rownames(sigs[sigs$log2FoldChange < -2,])

GO_results <- enrichGO(gene = genes_to_test, OrgDb = "org.Hs.eg.db", keyType = "ENSEMBL", ont = "BP")

as.data.frame(GO_results)


plot(barplot(GO_results, showCategory = 15))

iBMEC vs iPSC. (padjusted value=0.05)

pds<-dds

pds$condition<-relevel(pds$condition, ref="iPSC")

pds<-DESeq(pds)

res = results(pds, contrast=c("condition","iBMEC","iPSC"), alpha=0.05)

summary(res)
## 
## out of 20362 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 4182, 21%
## LFC < 0 (down)     : 4232, 21%
## outliers [1]       : 1448, 7.1%
## low counts [2]     : 0, 0%
## (mean count < 3)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
sigs<-na.omit(res)

sigs.df<-as.data.frame(sigs)

columns(org.Hs.eg.db)

sigs.df$symbol<-mapIds(org.Hs.eg.db, keys=rownames(sigs.df), keytype = "ENSEMBL",column="SYMBOL")

sigs.df<-sigs.df%>%filter(!str_detect(symbol,'NA'))

Retrieve gene ontology terms associated with upregulated genes in iBMEC samples

genes_to_test <- rownames(sigs[sigs$log2FoldChange >5,])

GO_results <- enrichGO(gene = genes_to_test, OrgDb = "org.Hs.eg.db", keyType = "ENSEMBL", ont = "BP")

as.data.frame(GO_results)


plot(barplot(GO_results, showCategory = 15))

Retrieve gene ontology terms associated with downregulated genes in iBMEC samples

genes_to_test <- rownames(sigs[sigs$log2FoldChange < -2,])

GO_results <- enrichGO(gene = genes_to_test, OrgDb = "org.Hs.eg.db", keyType = "ENSEMBL", ont = "BP")

as.data.frame(GO_results)


plot(barplot(GO_results, showCategory = 15))

Heatmap expression of various cell marker genes

# Extract the transformed data matrix
transformed_matrix <- assay(vstdata)

# Convert the matrix to a data frame
transformed_df <- as.data.frame(transformed_matrix)

# Extract row names (Ensembl IDs)
ensembl_ids <- rownames(transformed_df)

# Map Ensembl IDs to gene symbols using org.Hs.eg.db
symbols <- mapIds(org.Hs.eg.db, keys = ensembl_ids, column = "SYMBOL", keytype = "ENSEMBL")

# Add gene symbols as a new column to the data frame
transformed_df$Symbol <- symbols[match(ensembl_ids, names(symbols))]

# Remove the initial row names
rownames(transformed_df) <- NULL

#save as csv
write.csv(transformed_df, "forheatmap.csv",row.names=F,sep='\t')
df<-read.delim("forheatmap.csv",header=T, sep=",")
head(df)


df2<-df%>%filter(Symbol%in% c("TMEM119","P2RY12","HEXB","FCRLS","SALL1","C1Q","GPR34","OLFML3","MERTK",
  "PROS1","TYRO3","TGFBR1","CD31","NG2","PDGFRB","CD146","NESTIN","VWF",
  "ACE","ADAMTS13","PECAM1","VCAM1","ICAM1","ICAM2","CD47","SELE","SLP",
  "CDH5","NECTIN2","ESAM","LEF1","FZD3","NOTUM","APCDD1","AXIN2","DIXDC1",
  "TNFRSF19","MECOM")) 

head(df2)

df3 <- df2 %>% 
  column_to_rownames(var = "Symbol")

#set a color scheme
#colors<-colorRampPalette(rev(brewer.pal(9,"Blues")))(255)

pheatmap(df3, scale="row",display_numbers=TRUE,fontsize_number=7,color = colorRampPalette(rev(c("#D73027", "#FC8D59", 
        "#FEE090", "#FFFFBF", "#E0F3F8", "#91BFDB", "#4575B4")))(100))

pheatmap(df3, cluster_cols=FALSE,  display_numbers=TRUE,fontsize_number=7,
          color = colorRampPalette(rev(c("#D73027", "#FC8D59", 
        "#FEE090", "#FFFFBF", "#E0F3F8", "#91BFDB", "#4575B4")))(100))

pheatmap(df3, cluster_cols=FALSE, cluster_rows = FALSE, display_numbers=TRUE,fontsize_number=7,
          color = colorRampPalette(rev(c("#D73027", "#FC8D59", 
        "#FEE090", "#FFFFBF", "#E0F3F8", "#91BFDB", "#4575B4")))(100))